Introduction

This document is a walkthrough of how to use the code for the Turnout Tracker. For a discussion of the math and the model, see Turnout Tracker Math.

Before Election Day: Fitting the Model

There are a number of parameters that need to be fit on historical data: baseline turnout rates, precinct covariances, etc.

Each election should have a config, which I’ve created in config.R. config is a list with the following items:

# Load the tidyverse for data manipulation and plotting used throughout.
library(tidyverse)

# config.R defines `config`, the election-specific settings list printed below.
source("config.R")
print(config)
## $city
## [1] "Philadelphia"
## 
## $city_filename
## [1] "philadelphia"
## 
## $timezone
## [1] "America/New_York"
## 
## $election_ds
## [1] "2018-11-06"
## 
## $start_hour
## [1] 7
## 
## $end_hour
## [1] 20
## 
## $precinct_shp_path
## [1] "data/2016_Ward_Divisions.shp"
## 
## $get_precinct_id
## function (df) 
## df$WARD_DIVSN
## 
## $get_ward_from_precinct
## function (precinct) 
## substr(precinct, 1, 2)
## 
## $turnout_df_path
## [1] "data/phila_turnout.csv"
## 
## $submission_bitly
## [1] "http://bit.ly/sixtysixturnout"
## 
## $google_doc
## [1] "docs.google.com/spreadsheets/d/1GCPVCim0T5Kt4Qveotibx8pDyR2ZPVlCjpUFAMPy9F4"
## 
## $ref_turnout
##   2014   2016 
## 381503 724394 
## 
## $site_name
## [1] "Sixty-Six Wards"
## 
## $precinct_name
## [1] "division"
## 
## $ward_name
## [1] "ward"
## 
## $map_legend_pos
## [1] 0.7 0.1

The helper function prep_shapefile will load the shapefiles, process them, and then save sf objects precincts.Rda and wards.Rda in data.

# prep_shapefiles.R lives one directory up; chdir = TRUE so its internal
# relative paths resolve against its own location.
source("../prep_shapefiles.R", chdir = TRUE)
# Build and save the precinct/ward sf objects from the raw shapefile,
# using the config's accessor functions to derive IDs.
prep_shapefile(
  config$precinct_shp_path,      # path to the precinct .shp file
  config$get_precinct_id,        # function: shapefile df -> precinct id
  config$get_ward_from_precinct  # function: precinct id -> ward id
)
## Reading layer `2016_Ward_Divisions' from data source `C:\Users\Jonathan Tannen\Dropbox\sixty_six\posts\election_day_tracker\tracker_v0\phila_example\data\2016_Ward_Divisions.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1686 features and 5 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 2660575 ymin: 204817.1 xmax: 2750115 ymax: 304942.4
## epsg (SRID):    NA
## proj4string:    +proj=lcc +lat_1=39.93333333333333 +lat_2=40.96666666666667 +lat_0=39.33333333333334 +lon_0=-77.75 +x_0=600000 +y_0=0 +datum=NAD83 +units=us-ft +no_defs

Before election day, we need to calculate the historic fixed effects and correlations. All of the prep work is done in calc_params. The input is a dataframe, turnout_df, which has columns precinct, year, turnout. Precinct is the unique identifier for the precinct, year is the year, and turnout is the voter count. You will need to crosswalk turnout to the present-day precincts if boundaries have changed.

# Historic turnout data. col_types = "cci": precinct (chr), year (chr),
# turnout (int) -- precinct/year stay character so leading zeros survive.
df <- read_csv(config$turnout_df_path, col_types = "cci")
head(arrange(df, precinct, year))
## # A tibble: 6 x 3
##   precinct  year turnout
##      <chr> <chr>   <int>
## 1     0101  2002     185
## 2     0101  2003     213
## 3     0101  2004     311
## 4     0101  2005      62
## 5     0101  2006     188
## 6     0101  2007     133

We can now calculate the historic modelParams:

source("../calc_params.R", chdir=TRUE)

# Fit the historic parameters: fixed effects by year and precinct, a
# rank-3 SVD of the residuals, and the precinct covariance matrix.
params <- calc_params(
  turnout_df=df, 
  n_svd=3
) 
## [1] "Fitting fixed effects"
## [1] "Calculating svd"
## [1] "Fitted vs True values, check for similarity:"
## [1] "Fitted:"
##             [,1]         [,2]        [,3]       [,4]         [,5]
## [1,] -0.20147055 -0.228310184 -0.20506925 0.09831136 -0.094680904
## [2,] -0.22472082 -0.244379870 -0.19986982 0.05236669 -0.102205314
## [3,] -0.13970275 -0.198962023 -0.26695640 0.28773902 -0.084829646
## [4,] -0.14382335 -0.176291437 -0.21788574 0.12180460 -0.090411349
## [5,]  0.06151261 -0.003176427 -0.13877028 0.37842999  0.006568597
## [6,] -0.15504926 -0.186517743 -0.05556871 0.22361222 -0.004858795
##             [,6]
## [1,] -0.01240732
## [2,] -0.03115861
## [3,]  0.06533859
## [4,]  0.03109524
## [5,]  0.12490579
## [6,] -0.05945405
## [1] "True:"
##          2002        2003        2004       2005        2006        2007
## 1 -0.13389113 -0.13235832 -0.16260794 0.05179354 -0.18585064 -0.05520636
## 2 -0.24396582 -0.19873952 -0.18205661 0.01752606 -0.05475238 -0.03911382
## 3 -0.15031947 -0.16485037 -0.26297073 0.35098272 -0.10616203 -0.07884505
## 4 -0.07957513 -0.04238100 -0.23558753 0.09394798 -0.15914053 -0.11567130
## 5  0.04711097 -0.01618755 -0.02735383 0.49387924 -0.07024164 -0.07331769
## 6  0.07082694 -0.11549764  0.03684136 0.19896227 -0.13399209 -0.20416776
## [1] "Calculating covariances"
## params has a copy of turnout_df, with some new columns.
print(head(params@turnout_df))
## # A tibble: 6 x 5
##   precinct  year turnout log_turnout precinct_num
##     <fctr> <chr>   <int>       <dbl>        <dbl>
## 1     0101  2002     185    5.225747            1
## 2     0101  2003     213    5.365976            1
## 3     0101  2004     311    5.743003            1
## 4     0101  2005      62    4.143135            1
## 5     0101  2006     188    5.241747            1
## 6     0101  2007     133    4.897840            1
## params has an estimate of the year_fe, on the log scale.
print(head(params@year_fe))
## # A tibble: 6 x 2
##    year  year_fe
##   <chr>    <dbl>
## 1  2002 5.401706
## 2  2003 5.540403
## 3  2004 5.947680
## 4  2005 4.133410
## 5  2006 5.469666
## 6  2007 4.995115
## params has an estimate of the precinct_fe, on the log scale.
print(head(params@precinct_fe))
## # A tibble: 6 x 2
##   precinct precinct_fe
##     <fctr>       <dbl>
## 1     0101 -0.04206852
## 2     0102  0.17979758
## 3     0103  0.43558850
## 4     0104  0.23855043
## 5     0105 -0.33682950
## 6     0106 -0.32503880
## params has the svd results, which is used for the covariance (more on this later).
print(head(params@svd$u))
##             [,1]         [,2]        [,3]
## [1,] -0.02754661 -0.017978884 0.015274183
## [2,] -0.02616671 -0.021699299 0.014028596
## [3,] -0.03819923 -0.005953171 0.020693615
## [4,] -0.02900567 -0.010686186 0.009020878
## [5,] -0.02358425  0.017156977 0.017229155
## [6,] -0.01263178 -0.012173448 0.037379013
print(head(params@svd$v))
##             [,1]       [,2]        [,3]
## [1,]  0.08213496 0.40051557 -0.08990939
## [2,]  0.13848406 0.33597093 -0.15311784
## [3,]  0.32637105 0.05208209  0.11209786
## [4,] -0.21698794 0.44557065  0.55918216
## [5,]  0.11583743 0.13598768  0.12836188
## [6,] -0.13066026 0.23591134 -0.09771640
## params has the estimated covariance matrix among precincts (and its inverse)
print(params@precinct_cov[1:6, 1:6])
##            [,1]       [,2]       [,3]       [,4]       [,5]       [,6]
## [1,] 0.05039988 0.03955523 0.04671144 0.03700001 0.02062252 0.02495823
## [2,] 0.03955523 0.05066671 0.04492075 0.03628081 0.01760113 0.02485714
## [3,] 0.04671144 0.04492075 0.07272777 0.04640640 0.03629812 0.02888068
## [4,] 0.03700001 0.03628081 0.04640640 0.04691879 0.02376446 0.02097986
## [5,] 0.02062252 0.01760113 0.03629812 0.02376446 0.04257400 0.01358890
## [6,] 0.02495823 0.02485714 0.02888068 0.02097986 0.01358890 0.03582176

I also provide some helper functions to make diagnostic plots. These require an sf object with the precinct shapefiles. (The outputs of prep_shapefile suffice).

The diagnostics include plots of (a) the fixed effects by precinct and by year, and (b) the svd components for the estimated covariances, along with each dimension’s score in each year. You should sanity check that the combination of precincts and elections make sense.

library(sf)
# Load the precinct sf object written earlier by prep_shapefile.
# (safe_load presumably returns the object stored in the .Rda file;
# it is defined in a previously sourced helper -- confirm there.)
divs <- safe_load("data/precincts.Rda")
head(divs)
## Simple feature collection with 6 features and 7 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 2670826 ymin: 239354.5 xmax: 2684800 ymax: 240827.4
## epsg (SRID):    NA
## proj4string:    +proj=lcc +lat_1=39.93333333333333 +lat_2=40.96666666666667 +lat_0=39.33333333333334 +lon_0=-77.75 +x_0=600000 +y_0=0 +datum=NAD83 +units=us-ft +no_defs
##   YEAR WARD_DIVSN WARD DIVSN AREA_SFT precinct ward
## 1 2016       3403   34     3   629193     3403   34
## 2 2016       0605    6     5  1052544     0605   06
## 3 2016       0420    4    20   725059     0420   04
## 4 2016       0413    4    13   841331     0413   04
## 5 2016       0602    6     2   890778     0602   06
## 6 2016       2411   24    11   956738     2411   24
##                         geometry
## 1 POLYGON ((2671041.01971254 ...
## 2 POLYGON ((2681066.86908339 ...
## 3 POLYGON ((2672172.5558353 2...
## 4 POLYGON ((2672189.19031645 ...
## 5 POLYGON ((2680101.55113213 ...
## 6 POLYGON ((2684655.94097348 ...
diagnostics(params, divs)
## [1] "Plotting Diagnostics..."

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

The plots look good. Dimension 1 is blue for Hispanic North Philly and the University of Pennsylvania, and the line plot shows that these precincts had disproportionately high turnout in 2004, 2008, 2012, 2016 (the presidential elections). Dimension 2 has captured population change (red divisions are increasing, blue divisions are decreasing). Dimension 3 is hard to interpret, and may be noise/misfitting…

Let’s save the results and move on.

save_with_backup(params, stem="params", dir="outputs")

Testing on Fake Data

An important validation is to test the model on a fake, known distribution. The function load_data will either load data from our google-form download (later), or create a fake dataset with an S-curve.

source("../fit_submissions.R", chdir=TRUE)

# With use_real_data=FALSE, load_data simulates a fake S-curve dataset
# instead of downloading real google-form submissions.
data_list <- load_data(use_real_data=FALSE, params=params, election_config=config)
raw_data <- data_list$raw_data
fake_data <- data_list$fake_data

print("True Turnout to be estimated")
## [1] "True Turnout to be estimated"
fake_data$true_turnout
## [1] 784534.9
# Fit the EM model to the (fake) submissions; tight tolerance, quiet output.
em_fit <- fit_em_model(
  raw_data, params, verbose=FALSE, tol=1e-10, use_inverse=FALSE, election_config = config
)
## [1] "n_iter = 40"
# Post-process the EM fit into predictions; plots = TRUE shows diagnostic
# plots, save_results = FALSE keeps this run from writing to disk, and
# fake_data lets the plots compare against the known true turnout.
fit <- process_results(
  em_fit$precinct_re_fit, 
  em_fit$loess_fit, 
  raw_data,
  em_fit$resid,
  params,
  election_config=config,
  plots = TRUE, 
  save_results = FALSE,
  fake_data=fake_data
)
## [1] "predicting loess"
## [1] "plots"

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...
## [1] "div_turnout"
## [1] "time_df"
## [1] "full_predictions"
print("Estimate:")
## [1] "Estimate:"
# Citywide estimate: sum precinct predictions at the final time of day.
fit$full_predictions %>% 
  filter(time_of_day == max(time_of_day)) %>%
  with(sum(prediction))
## [1] 702665.3

But we don’t want a single estimate, we want a bootstrap of estimates. This can take a few minutes:

source("../bootstrap.R", chdir=TRUE)

# Refit the EM model on 100 bootstrap resamples to get an uncertainty
# interval around the turnout estimate.
bs <- fit_bootstrap(
  raw_data,
  params,
  election_config=config,
  n_boot=100,
  use_inverse=FALSE,
  verbose=FALSE
)
## [1] "Raw Result"
## [1] "n_iter = 40"
## [1] "n_iter = 55"
## [1] "n_iter = 69"
## [1] "n_iter = 60"
## [1] "n_iter = 53"
## [1] "n_iter = 55"
## [1] "n_iter = 59"
## [1] "n_iter = 51"
## [1] "n_iter = 61"
## [1] "n_iter = 57"
## [1] "n_iter = 58"
## [1] "n_iter = 60"
## [1] "n_iter = 54"
## [1] "n_iter = 52"
## [1] "n_iter = 53"
## [1] "n_iter = 53"
## [1] "n_iter = 59"
## [1] "n_iter = 55"
## [1] "n_iter = 53"
## [1] "n_iter = 57"
## [1] "n_iter = 52"
## [1] "n_iter = 60"
## [1] "n_iter = 54"
## [1] "n_iter = 58"
## [1] "n_iter = 58"
## [1] "n_iter = 56"
## [1] "n_iter = 51"
## [1] "n_iter = 61"
## [1] "n_iter = 58"
## [1] "n_iter = 59"
## [1] "n_iter = 56"
## [1] "n_iter = 56"
## [1] "n_iter = 49"
## [1] "n_iter = 59"
## [1] "n_iter = 60"
## [1] "n_iter = 59"
## [1] "n_iter = 61"
## [1] "n_iter = 68"
## [1] "n_iter = 57"
## [1] "n_iter = 57"
## [1] "n_iter = 58"
## [1] "n_iter = 47"
## [1] "n_iter = 56"
## [1] "n_iter = 66"
## [1] "n_iter = 55"
## [1] "n_iter = 59"
## [1] "n_iter = 57"
## [1] "n_iter = 59"
## [1] "n_iter = 56"
## [1] "n_iter = 64"
## [1] "n_iter = 55"
## [1] "n_iter = 58"
## [1] "n_iter = 57"
## [1] "n_iter = 56"
## [1] "n_iter = 60"
## [1] "n_iter = 60"
## [1] "n_iter = 59"
## [1] "n_iter = 56"
## [1] "n_iter = 59"
## [1] "n_iter = 54"
## [1] "n_iter = 59"
## [1] "n_iter = 62"
## [1] "n_iter = 60"
## [1] "n_iter = 57"
## [1] "n_iter = 56"
## [1] "n_iter = 60"
## [1] "n_iter = 58"
## [1] "n_iter = 61"
## [1] "n_iter = 64"
## [1] "n_iter = 61"
## [1] "n_iter = 57"
## [1] "n_iter = 46"
## [1] "n_iter = 58"
## [1] "n_iter = 55"
## [1] "n_iter = 56"
## [1] "n_iter = 57"
## [1] "n_iter = 57"
## [1] "n_iter = 58"
## [1] "n_iter = 59"
## [1] "n_iter = 56"
## [1] "n_iter = 54"
## [1] "n_iter = 60"
## [1] "n_iter = 60"
## [1] "n_iter = 58"
## [1] "n_iter = 61"
## [1] "n_iter = 67"
## [1] "n_iter = 59"
## [1] "n_iter = 54"
## [1] "n_iter = 55"
## [1] "n_iter = 59"
## [1] "n_iter = 65"
## [1] "n_iter = 54"
## [1] "n_iter = 53"
## [1] "n_iter = 60"
## [1] "n_iter = 52"
## [1] "n_iter = 56"
## [1] "n_iter = 57"
## [1] "n_iter = 59"
## [1] "n_iter = 58"
## [1] "n_iter = 56"
## [1] "n_iter = 55"
## [1] "BS Turnout: 671534 699693 726493"
# Histogram of the bootstrapped turnout estimates.
gg_bs_hist <- hist_bootstrap(bs) 
print(gg_bs_hist)

# Turnout-over-time plot from the bootstrap fits and raw submissions.
gg_turnout <- turnout_plot(
  bs,
  raw_data,
  config
)
print(gg_turnout)

# Persist the bootstrap results to outputs/.
save_with_backup(
  bs,
  stem="bootstrap",
  dir="outputs"
)  

# Save both diagnostic plots to outputs/ as 7x7-inch PNGs (with backups).
for (nm in c("gg_turnout", "gg_bs_hist")) {
  ggsave_with_backup(
    get(nm),
    filestem = nm,
    plottype = "png",
    width = 7,
    height = 7,
    dir = "outputs"
  )
}

On Election Day

There are a few specifics that need to be handled on election day. The file run_all.R does four things: (a) download the google data, (b) calculate the bootstrapped estimate, (c) run election_tracker.Rmd, which creates the markdown report, and then (d) push the html to github using upload_git.bat.

On election day, you just run run_all.R and leave your computer on.